package com.diven.spark.ml.learn.feature

import org.apache.spark.sql.{DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.ml.feature.Bucketizer
import com.diven.spark.ml.learn.core.BaseTest
import com.diven.spark.ml.learn.core.BaseSpark

object BucketizerTest extends BaseTest {

    /** Factory entry point: builds a fresh [[BucketizerTest]] instance,
      * exposed through the common [[BaseSpark]] interface used by the
      * test harness.
      */
    def apply(): BaseSpark = {
        val runner = new BucketizerTest()
        runner
    }

}

class BucketizerTest extends BaseSpark {

    /** Buckets the continuous `weight` column into discrete bins and prints
      * the transformed frame.
      *
      * The input column is cast to Double (required by Bucketizer); rows
      * whose value cannot be bucketed (e.g. NaN) are dropped because
      * handleInvalid is set to "skip".
      *
      * @param dataFrame input frame; assumed to contain a `weight` column —
      *                  TODO confirm against the BaseSpark driver
      */
    override def execute(dataFrame: DataFrame) = {
        // Feature names: input column and the bucketized output column.
        val feature = "weight"
        val feature_new = "weight_bucketizer"
        // Bucket boundaries; each bucket is [lower, upper). Infinities (the
        // form recommended by the Spark Bucketizer docs) make the outer
        // buckets truly open-ended, unlike Double.MinValue/MaxValue which
        // would exclude +/-Infinity values.
        val splits: Array[Double] = Array(Double.NegativeInfinity, 20, 50, 70, Double.PositiveInfinity)
        // Preprocessing: Bucketizer requires a numeric (Double) input column.
        val dataset = dataFrame.select(col(feature).cast(DataTypes.DoubleType))
        // Feature bucketization (NOT binarization — this maps each value to
        // the index of the bucket it falls in).
        val transform = new Bucketizer()
        .setInputCol(feature)         // column to transform
        .setOutputCol(feature_new)    // name of the transformed column
        .setSplits(splits)            // bucket boundaries, [lower, upper)
        .setHandleInvalid("skip")     // drop rows with invalid entries
        .transform(dataset)
        // Print the result for inspection.
        transform.show()
    }

}
